Choosing a Species

# original species data
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo <- FALSE

# cleaned species data
obs_clean <- file.path(dir_data, "obs_clean.csv")
obs_clean_geo <- file.path(dir_data, "obs_clean.geojson")


# get species occurrence data from GBIF with coordinates
(res <- spocc::occ(
  limit = 10000,
  query = 'Betula lenta', 
  from = 'gbif', has_coords = T))
## Searched: gbif
## Occurrences - Found: 2,667, Returned: 2,667
## Search type: Scientific
##   gbif: Betula lenta (2667)
# extract data frame from result
betula_df <- res$gbif$data[[1]] 
count <- nrow(betula_df) # number of rows

View All Results

# convert to points of observation from lon/lat columns in data frame
obs <- betula_df %>% 
  sf::st_as_sf(
    coords = c("longitude", "latitude"),
    crs = st_crs(4326)) %>% 
  select(1,27:80,163)

# create .csv and sf object
readr::write_csv(betula_df, obs_csv)
sf::write_sf(obs, obs_geo)
## Warning in CPL_write_ogr(obj, dsn, layer, driver,
## as.character(dataset_options), : GDAL Error 6: DeleteLayer() not supported by
## this dataset.
# show points on map
mapview::mapview(obs, map.types = "OpenTopoMap")

Data Cleaning

# remove duplicate observations
dups2 <- duplicated(betula_df[, c('longitude', 'latitude')])
sum(dups2)
## [1] 395
betula_clean <- betula_df[!dups2, ]

# remove values outside of Eastern US
betula_clean <- betula_clean %>% 
  filter(latitude > 31,
         latitude < 47,
         longitude > -90,
         longitude < -69
)

View Clean Results

# convert to points of observation from lon/lat columns in data frame
obs_cleaned <- betula_clean %>% 
  sf::st_as_sf(
    coords = c("longitude", "latitude"),
    crs = st_crs(4326)) %>% 
  select(1,27:80,163)

readr::write_csv(betula_clean, obs_clean)
sf::write_sf(obs_cleaned, obs_clean_geo)
## Warning in CPL_write_ogr(obj, dsn, layer, driver,
## as.character(dataset_options), : GDAL Error 6: DeleteLayer() not supported by
## this dataset.
# show points on map
mapview::mapview(obs_cleaned, map.types = "OpenTopoMap")

Question 1: How many observations total are in GBIF for your species? (Hint: ?occ)

  • Response - 2667`

Question 2. Do you see any odd observations, like marine species on land or vice versa? If so, please see the Data Cleaning and explain what you did to fix or remove these points.

  • Yes. I removed a couple duplicate values. I also checked out the Flora of North America which provides an overview of of the native species distribution. From there, I removed observations outside eastern United States. Observations outside this range are not considered “habitat.” Therefore, I filtered observations to keep only those within the eastern US.

Presence

dir_env <- file.path(dir_data, "env")

# set a default data directory
options(sdmpredictors_datadir = dir_env)

# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# show table of datasets
env_datasets %>% 
  select(dataset_code, description, citation) %>% 
  DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)

Environmental Layer Selection

# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_tri", "ER_topoWet")

# get layers
env_stack <- load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)

Convex Hull Creation

obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

if (!file.exists(obs_hull_geo) | TRUE){
  
  # make convex hull around points of observation
  obs_hull <- sf::st_convex_hull(st_union(obs_cleaned))
  
  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
## Warning in CPL_write_ogr(obj, dsn, layer, driver,
## as.character(dataset_options), : GDAL Error 6: DeleteLayer() not supported by
## this dataset.
obs_hull <- read_sf(obs_hull_geo)

# show points on map
mapview(
  list(obs_cleaned, obs_hull), map.types = "OpenTopoMap")
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)

Environmental Plot Subset

if (!file.exists(env_stack_grd) | redo){
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite=T)  
}
env_stack <- stack(env_stack_grd)

# show map
# mapview(obs) + 
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)

Generate Pseudo-Absence Points

absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

if (!file.exists(absence_geo) | redo){
  # get raster count of observations
  r_obs <- rasterize(
    sf::as_Spatial(obs_cleaned), env_stack[[1]], field=1, fun='count')
  
  # show map
  # mapview(obs) + 
  #   mapview(r_obs)
  
  # create mask for 
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
  
  # generate random points inside mask
  absence <- dismo::randomPoints(r_mask, nrow(obs_cleaned)) %>% 
    as_tibble() %>% 
    st_as_sf(coords = c("x", "y"), crs = 4326)
  
  write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)

# show map of presence, ie obs, and absence
mapview(obs_cleaned, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)

Combine Presence-Absence Points

if (!file.exists(pts_env_csv) | redo){

  # combine presence and absence into single set of labeled points 
  pts <- rbind(obs_cleaned %>% 
      mutate(present = 1) %>% 
      select(present),
    absence %>% 
      mutate(present = 0)) %>% 
    mutate(ID = 1:n()) %>% 
    relocate(ID)
  
  write_sf(pts, pts_geo, delete_dsn=T)

  # extract raster values for points
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>% 
    tibble() %>% 
    # join present and geometry columns to raster value results for points
    left_join(
      pts %>% 
        select(ID, present),
      by = "ID") %>% 
    relocate(present, .after = ID) %>% 
    # extract lon, lat as single columns
    mutate(
      #present = factor(present),
      lon = st_coordinates(geometry)[,1],
      lat = st_coordinates(geometry)[,2]) %>% 
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)
## Rows: 4450 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): ID, present, WC_alt, WC_bio1, WC_bio2, ER_tri, ER_topoWet, lon, lat
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>% 
  DT::datatable(
    rownames = F,
    options = list(
      dom = "t",
      pageLength = 20))